Loading the data …
bbb <- readr::read_rds("data/bbb.rds")
register("bbb")
visualize(
bbb,
xvar = "last",
yvar = "buyer",
type = "bar",
labs = list(
y = "Proportion of buyer = 'yes'",
x = "Months since last purchase (last)"
),
custom = FALSE
)
Creating the recency variable rec_iq using the following command in Data > Transform:
rec_iq = xtile(last, 5)
## create new variable(s)
bbb <- mutate(bbb, rec_iq = xtile(last, 5))
Does recency predict purchase? Are the best customers in quintile 1? The graph below shows this is indeed the case.
visualize(
bbb,
xvar = "rec_iq",
yvar = "buyer",
type = "bar",
labs = list(
y = "Proportion of buyer = 'yes'",
x = "Recency quintiles (rec_iq)"
),
custom = FALSE
)
Plots shows that purchase probility is NOT highest in the 1st quantile for frequencey (purch).
visualize(
bbb,
xvar = "purch",
yvar = "buyer",
type = "bar",
labs = list(
y = "Proportion of buyer = 'yes'",
x = "Purchase frequency (purch)"
),
custom = FALSE
)
This means we need to ‘flip’ the bin numbers so the highest purchase probility is in the 1st bin (quantile). The easiest way to do this is to add rev = TRUE in the call to xtile.
freq_iq = xtile(purch, 5, rev = TRUE)
Alternatively, you could use:
freq_iq = 6L - xtile(purch, 5)
## bin variables
bbb <- mutate(bbb, freq_iq = xtile(purch, 5, rev = TRUE))
visualize(
bbb,
xvar = "freq_iq",
yvar = "buyer",
type = "bar",
labs = list(
y = "Proportion of buyer = 'yes'",
x = "Frequency quintiles (freq_iq)"
),
custom = FALSE
)
Why are there only 4 values? Looking at the histogram below we see that the distribution of purch is heavily skewed (to the right). This makes it difficult to create 5 bins of similar size
visualize(bbb, xvar = "purch", color = "freq_iq")
The plot shows that purchase probility is NOT highest in the 1st quantile for monetary (total)
visualize(
bbb,
xvar = "total",
yvar = "buyer",
type = "bar",
labs = list(
y = "Proportion of buyer = 'yes'",
x = "Monetary value (total)"
),
custom = TRUE
) +
theme(axis.ticks = element_blank(), axis.text.x = element_blank())
Just like we did for frequency we have to ‘flip’ quantiles so the highest purchase probility is in the 1st quantile (i.e., add rev = TRUE)
mon_iq = xtile(total, 5, rev = TRUE)
## bin variables
bbb <- mutate(bbb, mon_iq = xtile(total, 5, rev = TRUE))
visualize(
bbb,
xvar = "mon_iq",
yvar = "buyer",
type = "bar",
labs = list(
y = "Proportion of buyer = 'yes'",
x = "Monetary value quantiles (mon_iq)"
),
custom = FALSE
)
Use Data > Transform > Create to generate the RFM index
rfm_iq = paste0(rec_iq, freq_iq, mon_iq)
## create new variable(s)
bbb <- mutate(bbb, rfm_iq = paste0(rec_iq, freq_iq, mon_iq))
visualize(
bbb,
xvar = "rfm_iq",
yvar = "buyer",
type = "bar",
labs = list(
y = "Proportion of buyer = 'yes'",
x = "Independent RFM index (rfm_iq)"
),
custom = TRUE
) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_hline(yintercept = 1/9)
## create new variable(s)
bbb <- group_by(bbb, rec_iq) %>%
mutate(freq_sq = xtile(purch, 5, rev = TRUE)) %>%
ungroup()
## create new variable(s)
bbb <- group_by(bbb, rec_iq, freq_sq) %>%
mutate(mon_sq = xtile(total, 5, rev = TRUE)) %>%
ungroup()
visualize(
bbb,
xvar = "freq_sq",
yvar = "buyer",
type = "bar",
labs = list(
y = "Proportion of buyer = 'yes'",
x = "Frequency quintiles (freq_sq)"
),
custom = FALSE
)
visualize(
bbb,
xvar = "freq_sq",
yvar = "buyer",
type = "bar",
labs = list(
y = "Proportion of buyer = 'yes'",
x = "Frequency quintiles (freq_sq)"
),
custom = FALSE
)
visualize(
bbb,
xvar = "mon_sq",
yvar = "buyer",
type = "bar",
labs = list(
y = "Proportion of buyer = 'yes'",
x = "Monetary value quintiles (mon_sq)"
),
custom = FALSE
)
Use Data > Transform > Create to generate the RFM index
rfm_sq = paste0(rec_iq, freq_sq, mon_sq)
## create new variable(s)
bbb <- mutate(bbb, rfm_sq = paste0(rec_iq, freq_sq, mon_sq))
visualize(
bbb,
xvar = "rfm_sq",
yvar = "buyer",
type = "bar",
labs = list(
y = "Proportion of buyer = 'yes'",
x = "Sequential RFM index (rfm_sq)"
),
custom = TRUE
) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_hline(yintercept = 1/9)
result <- explore(
bbb,
vars = "buyer",
fun = c("n_obs", "mean", "min", "max")
)
summary(result, top = "fun", dec = 4)
Explore
Data : bbb
Functions : n_obs, mean, min, max
Top : Function
variable n_obs mean min max
buyer 50,000 0.0904 0 1
The breakeven value is 11.11%. All cells above the breakeven line in the plot below will be mailed.
visualize(
bbb,
xvar = "rfm_iq",
yvar = "buyer",
type = "bar",
labs = list(
y = "Proportion of buyer = 'yes'",
x = "Independent RFM index (rfm_iq)"
),
custom = TRUE
) +
geom_hline(aes(yintercept = breakeven)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Now that we are creating variables to use in targeting we should use the training variable.
Create the mailto_iq variable for independent RFM
mailto_iq = mean(buyer == "yes") > 1/9
## mail TRUE or FALSE
bbb <- group_by(bbb, rfm_iq) %>%
mutate(mailto_iq = (sum(training == 1 & buyer == "yes") / sum(training == 1)) > breakeven) %>%
ungroup()
## also calculate response rate per group as an illustration
bbb <- group_by(bbb, rfm_iq) %>%
mutate(rfm_iq_resp = sum(training == 1 & buyer == "yes") / sum(training == 1)) %>%
ungroup()
result <- pivotr(
bbb,
cvars = c("mailto_iq", "training"),
fun = "n_obs",
normalize = "column"
)
summary(result, dec = 2, perc = TRUE)
Pivot table
Data : bbb
Categorical : mailto_iq training
Normalize by: column
training FALSE. TRUE. Total
0 29.83% 30.31% 30.00%
1 70.17% 69.69% 70.00%
Total 100.00% 100.00% 100.00%
result <- explore(
bbb,
vars = "buyer",
byvar = c("training", "mailto_iq"),
fun = c("n_obs", "mean")
)
summary(result, dec = 4)
Explore
Data : bbb
Grouped by : training mailto_iq
Functions : n_obs, mean
Top : Function
training mailto_iq variable n_obs mean
0 FALSE buyer 9,704 0.0542
0 TRUE buyer 5,296 0.1531
1 FALSE buyer 22,823 0.0561
1 TRUE buyer 12,177 0.1564
dat <- filter(bbb, training == 0)
perc_mail = mean(dat$mailto_iq)
nr_mail = 500000 * perc_mail
rep_rate <- filter(dat, mailto_iq == TRUE) %>%
summarize(rep_rate = mean(buyer == "yes")) %>%
pull("rep_rate")
nr_resp = nr_mail * rep_rate
mail_cost = 1 * nr_mail
profit = 9 * nr_resp - mail_cost
ROME = profit / mail_cost
Based on independent RFM the number of customers BBB should mail is 176,533 (35.31%). The response rate for the selected customers is predicted to be 15.31% or 27,033 buyers. The expected profit is $66,767. The mailing cost is estimated to be $176,533 with a ROME of 37.82%
Create the mailto_sq variable for sequential RFM
mailto_sq = mean(buyer == "yes") > 1/9
## mail TRUE or FALSE
bbb <- group_by(bbb, rfm_sq) %>%
mutate(mailto_sq = (sum(training == 1 & buyer == "yes") / sum(training == 1)) > breakeven) %>%
ungroup()
## also calculate response rate per group as an illustration
bbb <- group_by(bbb, rfm_sq) %>%
mutate(rfm_sq_resp = sum(training == 1 & buyer == "yes") / sum(training == 1)) %>%
ungroup()
result <- pivotr(
bbb,
cvars = c("mailto_iq", "training"),
fun = "n_obs",
normalize = "column"
)
summary(result, dec = 2, perc = TRUE)
Pivot table
Data : bbb
Categorical : mailto_iq training
Normalize by: column
training FALSE. TRUE. Total
0 29.83% 30.31% 30.00%
1 70.17% 69.69% 70.00%
Total 100.00% 100.00% 100.00%
result <- explore(
bbb,
vars = "buyer",
byvar = c("training", "mailto_iq"),
fun = c("n_obs", "mean")
)
summary(result, dec = 4)
Explore
Data : bbb
Grouped by : training mailto_iq
Functions : n_obs, mean
Top : Function
training mailto_iq variable n_obs mean
0 FALSE buyer 9,704 0.0542
0 TRUE buyer 5,296 0.1531
1 FALSE buyer 22,823 0.0561
1 TRUE buyer 12,177 0.1564
dat <- bbb
perc_mail = mean(dat$mailto_sq)
nr_mail = 500000 * perc_mail
rep_rate <- filter(dat, mailto_sq == TRUE) %>%
summarize(rep_rate = mean(buyer == "yes")) %>%
pull("rep_rate")
nr_resp = nr_mail * rep_rate
mail_cost_sq = 1 * nr_mail
profit_sq = 9 * nr_resp - mail_cost_sq
ROME_sq = profit_sq / mail_cost_sq
Based on sequential RFM the number of customers BBB should mail is 166,980 (33.40%). The response rate for the selected customers is predicted to be 15.80% or 26,390 buyers. The expected profit is $70,530. The mailing cost is estimated to be $166,980 with a ROME of 42.24%.
Compare this to the main results from independent RFM. The expected profit is $66,767. The mailing cost is estimated to be $176,533 with a ROME of 37.82%
If we select the predicted response rate for both of the rfm indices (i.e., rfm_iq_resp and rfm_iq_resp), and select a profit and ROME plot in Model > Evaluate Classification you should see the plots below. A visual inspection suggests that profits will be maximized if we target the top 35% of customers (approximately). You already calculated the exact percentages above. You should find that the number you calculated is very similar to the numbers highlighted green in the performance.xls file on Dropbox.
result <- evalbin(
bbb,
pred = c("rfm_iq_resp", "rfm_sq_resp"),
rvar = "buyer",
lev = "yes",
qnt = 50,
margin = 9,
train = "Test",
data_filter = "training == 1"
)
summary(result, prn = FALSE)
Evaluate predictions for binary response models
Data : bbb
Filter : training == 1
Results for : Test
Predictors : rfm_iq_resp, rfm_sq_resp
Response : buyer
Level : yes in buyer
Bins : 50
Cost:Margin : 1 : 9
plot(result, plots = "profit", custom = FALSE)
result <- logistic(
bbb,
rvar = "buyer",
evar = "rfm_sq",
lev = "yes",
data_filter = "training == 1"
)
summary(result)
Logistic regression (GLM)
Data : bbb
Filter : training == 1
Response variable : buyer
Level : yes in buyer
Explanatory variables: rfm_sq
Null hyp.: there is no effect of rfm_sq on buyer
Alt. hyp.: there is an effect of rfm_sq on buyer
OR coefficient std.error z.value p.value
(Intercept) -1.083 0.144 -7.548 < .001 ***
rfm_sq|112 0.941 -0.061 0.202 -0.300 0.764
rfm_sq|113 0.903 -0.103 0.211 -0.487 0.626
rfm_sq|114 0.807 -0.214 0.211 -1.015 0.310
rfm_sq|115 0.826 -0.191 0.213 -0.900 0.368
rfm_sq|121 0.609 -0.496 0.208 -2.381 0.017 *
rfm_sq|122 0.642 -0.444 0.209 -2.126 0.033 *
rfm_sq|123 0.513 -0.668 0.215 -3.111 0.002 **
rfm_sq|124 0.694 -0.365 0.204 -1.790 0.073 .
rfm_sq|125 0.644 -0.439 0.202 -2.172 0.030 *
rfm_sq|131 0.502 -0.690 0.388 -1.778 0.075 .
rfm_sq|132 0.591 -0.526 0.392 -1.342 0.180
rfm_sq|133 0.568 -0.566 0.374 -1.512 0.130
rfm_sq|134 0.543 -0.611 0.390 -1.568 0.117
rfm_sq|135 1.092 0.088 0.318 0.276 0.783
rfm_sq|141 0.525 -0.645 0.192 -3.366 < .001 ***
rfm_sq|142 0.543 -0.610 0.193 -3.160 0.002 **
rfm_sq|143 0.427 -0.851 0.198 -4.290 < .001 ***
rfm_sq|144 0.419 -0.870 0.201 -4.336 < .001 ***
rfm_sq|145 0.417 -0.875 0.200 -4.378 < .001 ***
rfm_sq|151 0.472 -0.751 0.196 -3.832 < .001 ***
rfm_sq|152 0.355 -1.036 0.208 -4.972 < .001 ***
rfm_sq|153 0.413 -0.885 0.201 -4.410 < .001 ***
rfm_sq|154 0.317 -1.149 0.210 -5.468 < .001 ***
rfm_sq|155 0.393 -0.934 0.204 -4.582 < .001 ***
rfm_sq|211 0.793 -0.232 0.206 -1.125 0.261
rfm_sq|212 0.694 -0.365 0.212 -1.724 0.085 .
rfm_sq|213 0.572 -0.559 0.221 -2.526 0.012 *
rfm_sq|214 0.828 -0.189 0.205 -0.921 0.357
rfm_sq|215 0.633 -0.457 0.211 -2.170 0.030 *
rfm_sq|221 0.445 -0.810 0.224 -3.620 < .001 ***
rfm_sq|222 0.625 -0.469 0.215 -2.180 0.029 *
rfm_sq|223 0.565 -0.570 0.224 -2.541 0.011 *
rfm_sq|224 0.464 -0.768 0.226 -3.400 < .001 ***
rfm_sq|225 0.253 -1.375 0.264 -5.199 < .001 ***
rfm_sq|241 0.236 -1.444 0.235 -6.133 < .001 ***
rfm_sq|242 0.282 -1.265 0.221 -5.731 < .001 ***
rfm_sq|243 0.301 -1.202 0.219 -5.478 < .001 ***
rfm_sq|244 0.198 -1.620 0.248 -6.525 < .001 ***
rfm_sq|245 0.338 -1.085 0.212 -5.124 < .001 ***
rfm_sq|251 0.279 -1.276 0.226 -5.651 < .001 ***
rfm_sq|252 0.304 -1.192 0.218 -5.471 < .001 ***
rfm_sq|253 0.254 -1.371 0.231 -5.927 < .001 ***
rfm_sq|254 0.209 -1.563 0.240 -6.518 < .001 ***
rfm_sq|255 0.176 -1.738 0.254 -6.830 < .001 ***
rfm_sq|311 0.551 -0.596 0.207 -2.873 0.004 **
rfm_sq|312 0.455 -0.787 0.215 -3.660 < .001 ***
rfm_sq|313 0.452 -0.794 0.215 -3.692 < .001 ***
rfm_sq|314 0.448 -0.803 0.212 -3.783 < .001 ***
rfm_sq|315 0.361 -1.019 0.224 -4.552 < .001 ***
rfm_sq|321 0.247 -1.399 0.244 -5.745 < .001 ***
rfm_sq|322 0.380 -0.968 0.221 -4.381 < .001 ***
rfm_sq|323 0.263 -1.334 0.244 -5.470 < .001 ***
rfm_sq|324 0.362 -1.015 0.222 -4.569 < .001 ***
rfm_sq|325 0.269 -1.315 0.241 -5.449 < .001 ***
rfm_sq|341 0.290 -1.239 0.212 -5.842 < .001 ***
rfm_sq|342 0.253 -1.376 0.215 -6.384 < .001 ***
rfm_sq|343 0.167 -1.789 0.244 -7.319 < .001 ***
rfm_sq|344 0.150 -1.896 0.250 -7.578 < .001 ***
rfm_sq|345 0.164 -1.807 0.241 -7.485 < .001 ***
rfm_sq|351 0.127 -2.062 0.261 -7.908 < .001 ***
rfm_sq|352 0.175 -1.740 0.236 -7.360 < .001 ***
rfm_sq|353 0.180 -1.713 0.234 -7.318 < .001 ***
rfm_sq|354 0.120 -2.118 0.265 -7.999 < .001 ***
rfm_sq|355 0.140 -1.967 0.253 -7.763 < .001 ***
rfm_sq|411 0.510 -0.674 0.247 -2.727 0.006 **
rfm_sq|412 0.474 -0.746 0.255 -2.922 0.003 **
rfm_sq|413 0.475 -0.744 0.249 -2.986 0.003 **
rfm_sq|414 0.211 -1.556 0.321 -4.848 < .001 ***
rfm_sq|415 0.341 -1.076 0.271 -3.965 < .001 ***
rfm_sq|421 0.233 -1.456 0.304 -4.786 < .001 ***
rfm_sq|422 0.227 -1.482 0.304 -4.875 < .001 ***
rfm_sq|423 0.182 -1.705 0.330 -5.163 < .001 ***
rfm_sq|424 0.277 -1.282 0.291 -4.399 < .001 ***
rfm_sq|425 0.154 -1.872 0.355 -5.277 < .001 ***
rfm_sq|441 0.164 -1.807 0.294 -6.143 < .001 ***
rfm_sq|442 0.193 -1.647 0.282 -5.831 < .001 ***
rfm_sq|443 0.150 -1.899 0.301 -6.308 < .001 ***
rfm_sq|444 0.147 -1.920 0.309 -6.210 < .001 ***
rfm_sq|445 0.101 -2.294 0.352 -6.516 < .001 ***
rfm_sq|451 0.125 -2.081 0.328 -6.348 < .001 ***
rfm_sq|452 0.116 -2.157 0.339 -6.360 < .001 ***
rfm_sq|453 0.105 -2.249 0.339 -6.639 < .001 ***
rfm_sq|454 0.164 -1.807 0.294 -6.143 < .001 ***
rfm_sq|455 0.057 -2.865 0.436 -6.564 < .001 ***
rfm_sq|511 0.307 -1.181 0.258 -4.578 < .001 ***
rfm_sq|512 0.115 -2.162 0.369 -5.862 < .001 ***
rfm_sq|513 0.214 -1.540 0.283 -5.438 < .001 ***
rfm_sq|514 0.200 -1.612 0.303 -5.320 < .001 ***
rfm_sq|515 0.151 -1.893 0.318 -5.944 < .001 ***
rfm_sq|521 0.137 -1.985 0.354 -5.609 < .001 ***
rfm_sq|522 0.082 -2.505 0.438 -5.719 < .001 ***
rfm_sq|523 0.134 -2.013 0.370 -5.444 < .001 ***
rfm_sq|524 0.014 -4.269 1.013 -4.216 < .001 ***
rfm_sq|525 0.108 -2.223 0.411 -5.413 < .001 ***
rfm_sq|531 0.000 -13.483 120.126 -0.112 0.911
rfm_sq|532 0.000 -13.483 124.839 -0.108 0.914
rfm_sq|533 0.059 -2.829 1.020 -2.773 0.006 **
rfm_sq|534 0.056 -2.887 1.020 -2.832 0.005 **
rfm_sq|535 0.109 -2.213 0.734 -3.014 0.003 **
rfm_sq|541 0.049 -3.017 0.436 -6.921 < .001 ***
rfm_sq|542 0.049 -3.011 0.436 -6.908 < .001 ***
rfm_sq|543 0.067 -2.699 0.366 -7.365 < .001 ***
rfm_sq|544 0.055 -2.898 0.408 -7.110 < .001 ***
rfm_sq|545 0.023 -3.766 0.597 -6.308 < .001 ***
rfm_sq|551 0.071 -2.644 0.367 -7.212 < .001 ***
rfm_sq|552 0.080 -2.525 0.351 -7.191 < .001 ***
rfm_sq|553 0.033 -3.425 0.523 -6.551 < .001 ***
rfm_sq|554 0.042 -3.177 0.473 -6.721 < .001 ***
rfm_sq|555 0.042 -3.177 0.473 -6.721 < .001 ***
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Pseudo R-squared: 0.066
Log-likelihood: -9960.957, AIC: 20141.914, BIC: 21072.856
Chi-squared: 1417.262 df(109), p.value < .001
Nr obs: 35,000
pred <- predict(result, pred_data = bbb)
print(pred, n = 10)
Logistic regression (GLM)
Data : bbb
Filter : training == 1
Response variable : buyer
Level(s) : yes in buyer
Explanatory variables: rfm_sq
Interval : confidence
Prediction dataset : bbb
Rows shown : 10 of 50,000
rfm_sq Prediction 2.5% 97.5%
512 0.037 0.020 0.070
534 0.019 0.003 0.120
443 0.048 0.029 0.078
251 0.086 0.063 0.117
453 0.034 0.019 0.061
254 0.066 0.046 0.094
555 0.014 0.006 0.033
114 0.215 0.168 0.270
111 0.253 0.204 0.310
354 0.039 0.026 0.059
bbb <- store(bbb, pred, name = "predict_logit")
visualize(
bbb,
xvar = "rfm_sq",
yvar = "predict_logit",
type = "bar",
labs = list(
y = "Predicted purchase probability",
x = "Logistic regression with Sq. RFM"
),
data_filter = "training == 0",
custom = TRUE
) +
geom_hline(aes(yintercept = breakeven)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))